Collected Datasets Location: Reformed_Data (Folder)
CSV: CSV-Air_Pollution_Data-(Reformed_and_AQI_Values).csv
Website: Website-Air_Pollution_Data-(Reformed_and_AQI_Values).csv
API: API-Air_Pollution_Data-(Reformed_and_AQI_Values).csv
import sqlite3
import numpy as np
import pandas as pd
import seaborn as sb
import datetime as dt
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from scipy import stats
This class is for changing the midpoint on the color bar for graphs, see definition CSVandAPI_GradientPlot(c)
# Set the Colormap and Center on the Colorbar
class MidpointNormalize(colors.Normalize):
"""
Normalize the colorbar so that diverging bars work there
way from either side from a prescribed midpoint value
e.g. im=ax1.imshow(array, norm=MidpointNormalize(midpoint=0.,vmin=-100, vmax=100))
"""
def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
self.midpoint = midpoint
colors.Normalize.__init__(self, vmin, vmax, clip)
def __call__(self, value, clip=None):
x, y = [self.vmin, self.midpoint, self.vmax], [0, 0.5, 1]
return np.ma.masked_array(np.interp(value, x, y), np.isnan(value))
This function is for initially creating the database tables to load in the CSV data that has been collected
def createSQL_Tables(c):
# CSV Table
c.execute('''CREATE TABLE IF NOT EXISTS CSV_Data (
Measurement_Date DATE,
Measurement_Time TIME,
Station_Code INTEGER,
Country VARCHAR2,
City VARCHAR2,
District VARCHAR2,
Street_Address VARCHAR2,
Latitude INTEGER,
Longitude INTEGER,
SO2 INTEGER,
NO2 INTEGER,
O3 INTEGER,
CO INTEGER,
PM10 INTEGER,
PM25 INTEGER,
AQI_PM25 INTEGER,
AQI_Risk_Level VARCHAR2
)''')
# Website Table
c.execute('''CREATE TABLE IF NOT EXISTS Website_Data (
Measurement_Date DATE,
Country VARCHAR2,
City VARCHAR2,
District VARCHAR2,
PM25 INTEGER,
AQI_PM25 INTEGER,
AQI_Risk_Level VARCHAR2
)''')
# API Table
c.execute('''CREATE TABLE IF NOT EXISTS API_Data (
Measurement_Date DATE,
Country VARCHAR2,
City VARCHAR2,
District VARCHAR2,
Latitude INTEGER,
Longitude INTEGER,
PM10 INTEGER,
PM25 INTEGER,
AQI_PM25 INTEGER,
AQI_Risk_Level VARCHAR2
)''')
conn.commit()
This function is for loading in all the data from the CSV files into the database tables.
def loadData(c):
# Read in AirData
airData_CSV = pd.read_csv('Reformed_Data/CSV-Air_Pollution_Data-(Reformed_and_AQI_Values).csv')
airData_Website = pd.read_csv('Reformed_Data/Website-Air_Pollution_Data-(Reformed_and_AQI_Values).csv')
airData_API = pd.read_csv('Reformed_Data/API-Air_Pollution_Data-(Reformed_and_AQI_Values).csv')
# Load data into SQL tables
airData_CSV.to_sql('CSV_Data', conn, if_exists='replace', index=False)
airData_Website.to_sql('Website_Data', conn, if_exists='replace', index=False)
airData_API.to_sql('API_Data', conn, if_exists='replace', index=False)
print('CSV Data\n')
for row in c.execute('SELECT * FROM CSV_Data limit 5'):
print(row)
print('\n\nWebsite Data\n')
for row in c.execute('SELECT * FROM Website_Data limit 5'):
print(row)
print('\n\nAPI Data\n')
for row in c.execute('SELECT * FROM API_Data limit 5'):
print(row)
This functions plots the histograms, with a normalization curve, of each of the data tables AQI values
def tableDistPlots(c):
tableList = []
for row in c.execute("SELECT name FROM sqlite_master WHERE type='table'"):
tableList += [row[0]]
for table in tableList:
c.execute("SELECT [AQI_(PM2.5)] FROM " + table)
AQI_Values = []
for row in c.fetchall():
AQI_Values += [row[0]]
fig = plt.figure()
fig.set_size_inches(16, 10)
plot = sb.distplot(AQI_Values, fit=stats.norm, kde=False, fit_kws={"label": "Normalization Curve"})
plot.set(title=table + ' Table, AQI Values Histogram', xlabel='AQI Values')
plt.legend()
This function plots a scatter plot of the AQI values by district using the CSV, Website and API database tables
def CSV_Website_API_ScatterPlot(c):
districtList = []
AQI_Values = []
# CSV, Website and API Data
for row in c.execute('''SELECT District, [AQI_(PM2.5)] FROM CSV_Data
UNION SELECT District, [AQI_(PM2.5)] FROM Website_Data
UNION SELECT District, [AQI_(PM2.5)] FROM API_Data'''):
districtList += [row[0]]
AQI_Values += [row[1]]
fig = plt.figure()
fig.set_size_inches(16, 10)
plot = sb.stripplot(x=districtList, y=AQI_Values, alpha=0.7, jitter=True)
plot.set_xticklabels(labels=plot.get_xticklabels(), rotation=65)
plot.set(title='AQI Value by Seoul District', xlabel='District', ylabel='AQI Value')
This function plots a gradient plot of the AQI values by District and Date using the CSV and API database tables
def CSVandAPI_GradientPlot(c):
dateList = []
districtList = []
AQI_Values = []
# CSV and API Data
for row in c.execute('''SELECT Measurement_Date, District, [AQI_(PM2.5)] FROM CSV_Data
UNION SELECT Measurement_Date, District, [AQI_(PM2.5)] FROM API_Data
ORDER BY Measurement_Date'''):
dateList += [row[0]]
districtList += [row[1]]
AQI_Values += [row[2]]
dateList = [dt.datetime.strptime(date, '%m/%d/%Y').date() for date in dateList]
dateList.sort()
fig = plt.figure()
fig.set_size_inches(16, 10)
norm = MidpointNormalize(midpoint=75)
plot = plt.scatter(x=dateList, y=districtList, c=AQI_Values, alpha=0.5, cmap='RdYlGn_r', norm=norm)
cbar = plt.colorbar(plot)
cbar.ax.set_title('AQI Scale')
plt.title('Seoul District AQI Values by Date')
plt.xlabel('Date')
plt.ylabel('Districts')
# Creates SQlite3 database in the same location as this notebook
conn = sqlite3.connect('AirDatabase.db')
c = conn.cursor()
createSQL_Tables(c)
loadData(c)
tableDistPlots(c)
This plot shows the overall distribution of AQI values by Seoul District from all three data tables, the higher the AQI Value the worse the air pollution is.
# Higher AQI Value = Bad
CSV_Website_API_ScatterPlot(c)
This plot shows the distribution of AQI values by Seoul District and Date from the CSV and API data tables, using a color gradient scale (based on the AQI value thresholds from the EPA) to show how dangerous that particular AQI value is: Green is Good and Red is Bad.
The gap in the graph is the gap in time between the CSV and API's data collection because the API data was collected far more recently that than the collected CSV data.
# Green Good, Red Bad
CSVandAPI_GradientPlot(c)